notebook.community

Edit and run



In [43]:

    
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
from datetime import datetime, timedelta, date
import operator
import matplotlib.pyplot as plt
from collections import namedtuple
%matplotlib notebook



In [15]:

    
events = pd.read_csv('data/rain_events_ohare.csv')
events = events[(events['duration_hrs'] > 1) | (events['total_precip'] > 0.08)]
events['start_time'] = pd.to_datetime(events['start_time'])
events['end_time'] = pd.to_datetime(events['end_time'])
events.head()









    Out[15]:






  
    
      
      start_time
      end_time
      duration_hrs
      total_precip
    
  
  
    
      0
      1970-03-19 18:00:00
      1970-03-19 22:00:00
      4.0
      0.18
    
    
      1
      1970-03-25 21:00:00
      1970-03-26 07:00:00
      10.0
      0.27
    
    
      2
      1970-04-13 03:00:00
      1970-04-13 04:00:00
      1.0
      0.24
    
    
      4
      1970-04-19 03:00:00
      1970-04-19 10:00:00
      7.0
      0.29
    
    
      5
      1970-04-30 00:00:00
      1970-05-01 01:00:00
      25.0
      0.53



In [16]:

    
events = events.set_index('start_time')
events['start_time'] = events.index.values
events['avg_intensity'] = events['total_precip'] / events['duration_hrs']
def find_year(timestamp):
    return timestamp.year
events['year'] = events['start_time'].apply(find_year)
events.head()









    Out[16]:






  
    
      
      end_time
      duration_hrs
      total_precip
      start_time
      avg_intensity
      year
    
    
      start_time
      
      
      
      
      
      
    
  
  
    
      1970-03-19 18:00:00
      1970-03-19 22:00:00
      4.0
      0.18
      1970-03-19 18:00:00
      0.045000
      1970
    
    
      1970-03-25 21:00:00
      1970-03-26 07:00:00
      10.0
      0.27
      1970-03-25 21:00:00
      0.027000
      1970
    
    
      1970-04-13 03:00:00
      1970-04-13 04:00:00
      1.0
      0.24
      1970-04-13 03:00:00
      0.240000
      1970
    
    
      1970-04-19 03:00:00
      1970-04-19 10:00:00
      7.0
      0.29
      1970-04-19 03:00:00
      0.041429
      1970
    
    
      1970-04-30 00:00:00
      1970-05-01 01:00:00
      25.0
      0.53
      1970-04-30 00:00:00
      0.021200
      1970



In [17]:

    
events['avg_intensity'].plot()









    














    











    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x8cf1e80>



In [26]:

    
events[['year', 'avg_intensity']].groupby('year').mean().plot(kind='bar', title='Average event intensity by year')









    














    











    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0xad21d30>



In [44]:

    
events[['year', 'duration_hrs']].groupby('year').sum().plot(kind='bar', title='Total hours of rainfall per year')









    














    











    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0xd6d33c8>



In [48]:

    
events['year'].value_counts().sort_index().cumsum().plot(kind='bar', title='Cumulative number of events over time')









    














    











    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0xe679cc0>

The number of events over the years isn't really changing, pretty constant overall.

The Chicago Climate Action Plan expects less frequent summer rains, so let's test that out.



In [49]:

    
# Get the season of each event based on the start date
# This code is copied from http://stackoverflow.com/questions/16139306/determine-season-given-timestamp-in-python-using-datetime

Y = 2000 # dummy leap year to allow input X-02-29 (leap day)
seasons = [('winter', (date(Y,  1,  1),  date(Y,  3, 20))),
           ('spring', (date(Y,  3, 21),  date(Y,  6, 20))),
           ('summer', (date(Y,  6, 21),  date(Y,  9, 22))),
           ('autumn', (date(Y,  9, 23),  date(Y, 12, 20))),
           ('winter', (date(Y, 12, 21),  date(Y, 12, 31)))]

def get_season(timestamp):
    if isinstance(timestamp, datetime):
        timestamp = timestamp.date()
    timestamp = timestamp.replace(year=Y)
    return next(season for season, (start, end) in seasons
                if start <= timestamp <= end)

events['season'] = events['start_time'].apply(get_season)
events.head()









    Out[49]:






  
    
      
      end_time
      duration_hrs
      total_precip
      start_time
      avg_intensity
      year
      season
    
    
      start_time
      
      
      
      
      
      
      
    
  
  
    
      1970-03-19 18:00:00
      1970-03-19 22:00:00
      4.0
      0.18
      1970-03-19 18:00:00
      0.045000
      1970
      winter
    
    
      1970-03-25 21:00:00
      1970-03-26 07:00:00
      10.0
      0.27
      1970-03-25 21:00:00
      0.027000
      1970
      spring
    
    
      1970-04-13 03:00:00
      1970-04-13 04:00:00
      1.0
      0.24
      1970-04-13 03:00:00
      0.240000
      1970
      spring
    
    
      1970-04-19 03:00:00
      1970-04-19 10:00:00
      7.0
      0.29
      1970-04-19 03:00:00
      0.041429
      1970
      spring
    
    
      1970-04-30 00:00:00
      1970-05-01 01:00:00
      25.0
      0.53
      1970-04-30 00:00:00
      0.021200
      1970
      spring



In [50]:

    
summer_events = events[events['season'] == 'summer']
summer_events.head()









    Out[50]:






  
    
      
      end_time
      duration_hrs
      total_precip
      start_time
      avg_intensity
      year
      season
    
    
      start_time
      
      
      
      
      
      
      
    
  
  
    
      1970-07-19 15:00:00
      1970-07-19 16:00:00
      1.0
      0.97
      1970-07-19 15:00:00
      0.97
      1970
      summer
    
    
      1970-07-27 15:00:00
      1970-07-27 16:00:00
      1.0
      0.09
      1970-07-27 15:00:00
      0.09
      1970
      summer
    
    
      1970-07-28 18:00:00
      1970-07-28 19:00:00
      1.0
      0.15
      1970-07-28 18:00:00
      0.15
      1970
      summer
    
    
      1970-07-30 00:00:00
      1970-07-30 07:00:00
      7.0
      0.84
      1970-07-30 00:00:00
      0.12
      1970
      summer
    
    
      1970-08-03 00:00:00
      1970-08-03 01:00:00
      1.0
      0.13
      1970-08-03 00:00:00
      0.13
      1970
      summer



In [52]:

    
summer_events['year'].value_counts().sort_index().plot(kind='bar', title='Number of summer events over time')









    














    











    Out[52]:





<matplotlib.axes._subplots.AxesSubplot at 0xffd90b8>



In [53]:

    
# TODO: Left off here



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [7]:

    
# Let's take a look at summer showers.  Are they happening with less frequencly over the years?
summer_events = events[events['season'] == 'summer']
summer_events.head()









    Out[7]:






  
    
      
      start_time
      end_time
      duration_hrs
      total_precip
      season
      avg_intensity
      year
    
  
  
    
      15
      1970-07-19 15:00:00
      1970-07-19 16:00:00
      1.0
      0.97
      summer
      0.97
      1970
    
    
      16
      1970-07-27 15:00:00
      1970-07-27 16:00:00
      1.0
      0.09
      summer
      0.09
      1970
    
    
      17
      1970-07-28 18:00:00
      1970-07-28 19:00:00
      1.0
      0.15
      summer
      0.15
      1970
    
    
      18
      1970-07-30 00:00:00
      1970-07-30 07:00:00
      7.0
      0.84
      summer
      0.12
      1970
    
    
      19
      1970-08-03 00:00:00
      1970-08-03 01:00:00
      1.0
      0.13
      summer
      0.13
      1970



In [8]:

    
(summer_events['year'].value_counts().sort_index()).plot(kind='bar')









    














    











    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x4851160>



In [9]:

    
# No!



In [24]:

    
events.plot()









    














    











    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x95d5e10>



In [17]:

    
summer_events[['year', 'avg_intensity']].groupby('year').mean()









    Out[17]:






  
    
      
      avg_intensity
    
    
      year
      
    
  
  
    
      1970
      0.205799
    
    
      1971
      0.397500
    
    
      1972
      0.144155
    
    
      1973
      0.142399
    
    
      1974
      0.128182
    
    
      1975
      0.161194
    
    
      1976
      0.128193
    
    
      1977
      0.162803
    
    
      1978
      0.160575
    
    
      1979
      0.194020
    
    
      1980
      0.175186
    
    
      1981
      0.230929
    
    
      1982
      0.183712
    
    
      1983
      0.130849
    
    
      1984
      0.243835
    
    
      1985
      0.232875
    
    
      1986
      0.166415
    
    
      1987
      0.207753
    
    
      1988
      0.156492
    
    
      1989
      0.181051
    
    
      1990
      0.127498
    
    
      1991
      0.151146
    
    
      1992
      0.152012
    
    
      1993
      0.175254
    
    
      1994
      0.180714
    
    
      1995
      0.178958
    
    
      1996
      0.142795
    
    
      1997
      0.191356
    
    
      1998
      0.233324
    
    
      1999
      0.170130
    
    
      2000
      0.193310
    
    
      2001
      0.321542
    
    
      2002
      0.250802
    
    
      2003
      0.137312
    
    
      2004
      0.110622
    
    
      2005
      0.155365
    
    
      2006
      0.209591
    
    
      2007
      0.148382
    
    
      2008
      0.121732
    
    
      2009
      0.166667
    
    
      2010
      0.231535
    
    
      2011
      0.296641
    
    
      2012
      0.167478
    
    
      2013
      0.192209
    
    
      2014
      0.292894
    
    
      2015
      0.111071
    
    
      2016
      0.198331



In [16]:

    
summer_events[['year', 'avg_intensity']].groupby('year').mean().plot(kind='bar')









    














    











    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x84b6320>



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [35]:

    
summer_events['hours_between_events'] = (summer_events['start_time'] - summer_events['end_time'].shift()).astype('timedelta64[h]')
summer_events.head()









    



d:\data_science_projects\chicagorain\virtualenvs\nyear-venv\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':






    Out[35]:






  
    
      
      start_time
      end_time
      duration_hrs
      total_precip
      season
      intensity
      avg_intensity
      year
      hours_between_events
    
  
  
    
      15
      1970-07-19 15:00:00
      1970-07-19 16:00:00
      1.0
      0.97
      summer
      0.97
      0.97
      1970
      NaN
    
    
      16
      1970-07-27 15:00:00
      1970-07-27 16:00:00
      1.0
      0.09
      summer
      0.09
      0.09
      1970
      191.0
    
    
      17
      1970-07-28 18:00:00
      1970-07-28 19:00:00
      1.0
      0.15
      summer
      0.15
      0.15
      1970
      26.0
    
    
      18
      1970-07-30 00:00:00
      1970-07-30 07:00:00
      7.0
      0.84
      summer
      0.12
      0.12
      1970
      29.0
    
    
      19
      1970-08-03 00:00:00
      1970-08-03 01:00:00
      1.0
      0.13
      summer
      0.13
      0.13
      1970
      89.0



In [39]:

    
# Average time between events - in hours
(summer_events.groupby('year')['hours_between_events'].mean()).plot(kind='bar')









    














    











    Out[39]:





<matplotlib.axes._subplots.AxesSubplot at 0x9a2c4a8>



In [54]:

    
# Events per season per year
per_season = {year: {'winter': 0, 'spring': 0, 'autumn': 0, 'summer': 0} for year in range(1970,2017)}
for index, event in events.iterrows():
    per_season[event['year']][event['season']] += 1
events_per_season_by_year = pd.DataFrame(per_season)
events_per_season_by_year = events_per_season_by_year.transpose()
events_per_season_by_year.head()



In [59]:

    
events_per_season_by_year['summer'].plot(kind='bar')









    














    











    Out[59]:





<matplotlib.axes._subplots.AxesSubplot at 0xe528160>



In [ ]:

	start_time	end_time	duration_hrs	total_precip
0	1970-03-19 18:00:00	1970-03-19 22:00:00	4.0	0.18
1	1970-03-25 21:00:00	1970-03-26 07:00:00	10.0	0.27
2	1970-04-13 03:00:00	1970-04-13 04:00:00	1.0	0.24
4	1970-04-19 03:00:00	1970-04-19 10:00:00	7.0	0.29
5	1970-04-30 00:00:00	1970-05-01 01:00:00	25.0	0.53

	end_time	duration_hrs	total_precip	start_time	avg_intensity	year
start_time
1970-03-19 18:00:00	1970-03-19 22:00:00	4.0	0.18	1970-03-19 18:00:00	0.045000	1970
1970-03-25 21:00:00	1970-03-26 07:00:00	10.0	0.27	1970-03-25 21:00:00	0.027000	1970
1970-04-13 03:00:00	1970-04-13 04:00:00	1.0	0.24	1970-04-13 03:00:00	0.240000	1970
1970-04-19 03:00:00	1970-04-19 10:00:00	7.0	0.29	1970-04-19 03:00:00	0.041429	1970
1970-04-30 00:00:00	1970-05-01 01:00:00	25.0	0.53	1970-04-30 00:00:00	0.021200	1970

	end_time	duration_hrs	total_precip	start_time	avg_intensity	year	season
start_time
1970-07-19 15:00:00	1970-07-19 16:00:00	1.0	0.97	1970-07-19 15:00:00	0.97	1970	summer
1970-07-27 15:00:00	1970-07-27 16:00:00	1.0	0.09	1970-07-27 15:00:00	0.09	1970	summer
1970-07-28 18:00:00	1970-07-28 19:00:00	1.0	0.15	1970-07-28 18:00:00	0.15	1970	summer
1970-07-30 00:00:00	1970-07-30 07:00:00	7.0	0.84	1970-07-30 00:00:00	0.12	1970	summer
1970-08-03 00:00:00	1970-08-03 01:00:00	1.0	0.13	1970-08-03 00:00:00	0.13	1970	summer

	avg_intensity
year
1970	0.205799
1971	0.397500
1972	0.144155
1973	0.142399
1974	0.128182
1975	0.161194
1976	0.128193
1977	0.162803
1978	0.160575
1979	0.194020
1980	0.175186
1981	0.230929
1982	0.183712
1983	0.130849
1984	0.243835
1985	0.232875
1986	0.166415
1987	0.207753
1988	0.156492
1989	0.181051
1990	0.127498
1991	0.151146
1992	0.152012
1993	0.175254
1994	0.180714
1995	0.178958
1996	0.142795
1997	0.191356
1998	0.233324
1999	0.170130
2000	0.193310
2001	0.321542
2002	0.250802
2003	0.137312
2004	0.110622
2005	0.155365
2006	0.209591
2007	0.148382
2008	0.121732
2009	0.166667
2010	0.231535
2011	0.296641
2012	0.167478
2013	0.192209
2014	0.292894
2015	0.111071
2016	0.198331